### Replication Package for "Why is Intermediating Houses so Difficult? Evidence from iBuyers"
### Buchak, Matvos, Piskorski, and Seru
###
###
### buchak@stanford.edu

### This file creates the seller/mover analysis data for use in the model validation appendix

library(data.table)
library(ggplot2)
library(zoo)
library(Hmisc)
library(stringr)

### Toggle to T to use sharable sample data ###
SHARABLE = T 

raw.data.path = '../data/raw/corelogic/'
processed.data.path = '../data/processed/corelogic'
sharable.processed.data.path = '../data/processed/corelogic/share/'
sharable.raw.data.path = '../data/raw/corelogic/share/'


createSellerPanel <- function() {
  
  
  # Create a panel that is person x year.
  transactions <- loadData(SAMPLE = SHARABLE) # Get the cleaned up corelogic transactions
  
  # Create cleaned buyername
  transactions[,buyername_cleaned := trimws(paste(owner1firstnamemi,owner1lastname))]
  transactions[,buyername_cleaned := trimws(gsub(pattern = '[^[:alnum:] ]',x = buyername_cleaned,replace = ''))]
  
  # All people who owned properties. First, filter out corporations and 
  transactions[,n := sum(absenteeownerstatus != 'Y'),by='buyername_cleaned'] # If you have a ton of absentee owner stuff we're skeptical.....
  transactions[,zip5 := substr(situszipcode,1,5)]
  transactions[,buy_year := year]
  transactions$year <- NULL
  
  # Get rid of corporations, people with transactions > 10, and guys with corporate names.
  corpNames = c('[0-9]|ARIZONA|CORP|LLC|INC|LP|SALES|CONSTRUCTION|TRUST|INC|unknown|HOMES|FNAM|DEV|JR|HUD|LTD|INS|MAE|ASSN|MARKETING|RECONVEYANCE|BK|FSB|CORPORATION|PROPERTIES|ADMN|ASSOCIATES|BANK|AMERICA|CORPS|DEVELOPMENT|AFFAIRS|INVESTMENTS|COUNTY|CORPORAT|HMS')
  transactions[,corpname := grepl(x = buyername_cleaned,pattern = corpNames,ignore.case = T)]  
  
  realPeople <- transactions[buyername_cleaned != '' & str_length(owner1lastname) > 1 & str_length(owner1firstnamemi) > 1  & str_length(buyername_cleaned) > 4 & 
                               absenteeownerstatus != 'Y' & corpname == F & corporateindicator == '' & n <= 10 & buyername_cleaned != '' & str_length(buyername_cleaned) > 4, # Need at last 2 first-last...
                             c('fipscode','Market','zip5','buyername_cleaned','pclidirisfrmtd','Market','buy_year'),with=F]
  
  
  # Panel is person-id, year, where person is NAME-FIPS-YEAR (e.g., `Shaw' transactors in Atlanta in 2015)
  realPeople[,name := buyername_cleaned]
  realPeople[,n_person_fips_year := .N,by=c('name','fipscode','buy_year')]
  
  # Exclude undifferentiated people, i.e., non-unique person/fips/years This is about 90% of rows.
  mean(realPeople$n_person_fips_year==1)
  uniquePeopleFips <- realPeople[n_person_fips_year == 1]
  
  # Unique people
  myPeople <- uniquePeopleFips[!duplicated(paste(name,fipscode)),c('name','fipscode')]
  
  # Create the panel.
  people.years <- data.table(merge(data.frame(myPeople),data.frame(year = 1990:2018)),by=NULL)
  people.years <- people.years[order(name,fipscode,year)]
  
  # Get the last transaction for each pclidiris for each year.
  # You are the owner of house A if you were the last transaction in a given year
  transactions[,last.yearly.transaction := max(date),by=c('fipscode','pclidirisfrmtd','buy_year')]
  last.yearly <- transactions[date == last.yearly.transaction ]
  
  # Get the biggest sale amount
  last.yearly[,max.sale.amount := max(saleamount),by=c('fipscode','pclidirisfrmtd','buy_year')]
  last.yearly <- last.yearly[saleamount == max.sale.amount]
  
  # Drop if the record is duplicated
  last.yearly <- last.yearly[!duplicated(paste(fipscode,pclidirisfrmtd,buy_year))]
  
  # Detect when the property is next sold
  last.yearly <- last.yearly[order(fipscode,pclidirisfrmtd,buy_year)]
  last.yearly[,name := buyername_cleaned]
  
  # Here's how the merge works: We're trying to figure out where everyone is living at each time. We first merge on fips-name. This gives all the properties that person was in 
  # at that time. From transactions we need:
  # - pclidiris
  # - fips
  # - name
  # - buy_date
  # - sell_date (date of next transaction, if it occurs.)
  # - absentee owner status---we want to know if you leave the house b/c it's being an absentee owner, but we don't want to count you as living there when we figure out the living place.
  last.yearly[,sell_year := shift(buy_year,1,type='lead'),by=c('pclidirisfrmtd','fipscode')]
  toMerge <- last.yearly[,c('fipscode','Market','pclidirisfrmtd','name','buy_year','sell_year','absenteeownerstatus')]
  
  # This is thus merging every guy with every property he's ever owned.
  step.1 <- merge(people.years,toMerge,by=c('fipscode','name'),all.x=T,allow.cartesian = T)
  
  # Next, screen out absentee owner places
  step.2 <- step.1[absenteeownerstatus != 'Y']
  step.2$absenteeownerstatus <- NULL
  
  # Next, screen out places where year > sell_year or year <= buy_year (assumption is that you MOVE IN the year after you buy (like you buy on dec 31)) and you MOVE
  # OUT the year after you sell (e.g., if you sell on Dec 31, 2016, you still live in the house for 2016).
  step.2[is.na(sell_year),sell_year := 9999]
  step.3 <- step.2[year > buy_year & year <= sell_year ] # You move in after you buy, you stay while you sell.
  
  # Finally, it may be that you did this:
  # Buy A at 2010
  # Buy B at 2012
  # Sell A at 2013
  #
  # We'd get two observations for this individual in 2013, saying he lived in A and saying he lived in B
  # To break these ties, we want to figure out, of the remaining places, at year name-year, what's the latest house you bought?
  # We'll then say, okay, you live in the last house you bought
  
  step.3[,last.bought := max(buy_year),by=c('fipscode','name','year')]
  step.4 <- step.3[buy_year == last.bought]
  
  # There are some remaining double matches.
  # Some of these are unformatted pclidirisfrmteds, like "300-08-936-E" versus "300-08-936E"
  step.4[,id.temp := str_replace_all(pclidirisfrmtd,pattern='-',replacement='')]
  step.4 <- step.4[order(fipscode,name,year,pclidirisfrmtd)]
  step.5 <- step.4[!duplicated(paste(fipscode,name,year,id.temp))] # Gets rid of the straight-up duplicated IDs
  
  # Now there are double matches that are okay, e.g., if he bought two houses at the same time ...
  # Break ties in favor of where you stay longer.
  step.5[,last.sold := max(sell_year),by=c('fipscode','name','year')]
  step.6 <- step.5[sell_year == last.sold]
  
  # At this point there are 0.8% that have double things. Just pick the first one.
  step.7 <- step.6[!duplicated(paste(fipscode,name,year))]
  
  # Now just merge my fixed up one onto the blank one and preserve NAs, and remove everyone who NEVER shows up in my sample (e.g., they buy and sell before the sample starts)
  balanced <- merge(people.years,step.7,all.x=T,by=c('fipscode','name','year'))
  balanced <- balanced[,a := sum(!is.na(buy_year)),by=c('fipscode','name')]
  balanced <- balanced[a >= 1]
  balanced$a <- NULL
  
  # Remove nuisance columns
  balanced$last.bought <- NULL
  balanced$last.sold <- NULL
  balanced$id.temp <- NULL
  
  # Add lagging variables
  balanced <- balanced[order(fipscode,name,year)]
  balanced[sell_year == 9999,sell_year := NA]
  balanced[,pclidirisfrmtd.next := shift(pclidirisfrmtd,1,type='lead'),by=c('fipscode','name')]
  balanced[,buy_year.next := shift(buy_year,1,type='lead'),by=c('fipscode','name')]
  
  # Transacting variable---
  # You are selling on the year the house sells.But if you have NA in 2015 then house in 2016, you're "buying" in 2016
  balanced[,transaction := as.integer(is.na(pclidirisfrmtd) & !is.na(pclidirisfrmtd.next) | 
                                        is.na(pclidirisfrmtd.next) & !is.na(pclidirisfrmtd) | 
                                        pclidirisfrmtd != pclidirisfrmtd.next)]
  balanced[year == 2018 & is.na(sell_year),transaction := 0] # In 2018 because of truncation, you only countas selling if sell year == 2018.............
  balanced[year == 2018 & sell_year == 2018,transaction := 1] # this is the only way. 
  balanced[year == 2018 & is.na(pclidirisfrmtd) & is.na(pclidirisfrmtd.next),transaction := NA]
  balanced[year == 2018 & is.na(pclidirisfrmtd) & !is.na(pclidirisfrmtd.next),transaction := 1]
  # balanced <- balanced[year != 2018] # What we're saying makes no sense in 2018. 
  
  data.in <- balanced
  
  # Make the data at the person-house level
  spells <- data.in[!is.na(pclidirisfrmtd) ]
  
  spells[,tenure := year - buy_year - 1]
  spells[,sells := as.integer(!is.na(pclidirisfrmtd) & transaction == 1)]
  spells$transaction <- NULL
  
  
  # Get last yearly transactions
  last.yearly <- getLastYearlyTransaction()
  owner.merge <- last.yearly[,c('fipscode','pclidirisfrmtd','buy_year','saleamount','mortgageamount','mortgageloantypecode','iBuyer.seller','living_sqft','land_sqft','yearbuilt','multistory')]
  next.merge  <- last.yearly[,c('fipscode','pclidirisfrmtd','buy_year','saleamount','mortgageamount','mortgageloantypecode','iBuyer.seller','living_sqft','land_sqft','yearbuilt','multistory')]
  buyer.merge <- last.yearly[,c('fipscode','pclidirisfrmtd','buy_year','saleamount','mortgageamount','mortgageloantypecode','iBuyer.buyer')]
  
  # Want to do 3 merges
  # 1. Info about the house he lives in
  # 2. Info about who buys the house
  # 3. Info about the next house.
  
  # Need to include hedonics --- size, stories, age.
  names(owner.merge) <- c('fipscode','pclidirisfrmtd','buy_year','saleamount.this','mortgageamount.this','mortgageloantypecode.this','iBuyer.seller.this','living.sqft.this','land.sqft.this','yearbuilt.this','multistory.this')
  names(next.merge)  <- c('fipscode','pclidirisfrmtd','buy_year','saleamount.next','mortgageamount.next','mortgageloantypecode.next','iBuyer.seller.next','living.sqft.next','land.sqft.next','yearbuilt.next','multistory.next')
  names(buyer.merge) <- c('fipscode','pclidirisfrmtd','buy_year','saleamount.next.buyer','mortgageamount.next.buyer','mortgageloantypecode.next.buyer','iBuyer.next.buyer')
  
  spells <- merge(spells,owner.merge,by.x=c('fipscode','pclidirisfrmtd','buy_year'),by.y=c('fipscode','pclidirisfrmtd','buy_year'),all.x=T)
  spells <- merge(spells,next.merge ,by.x=c('fipscode','pclidirisfrmtd.next','buy_year.next'),by.y=c('fipscode','pclidirisfrmtd','buy_year'),all.x=T)
  spells <- merge(spells,buyer.merge,by.x=c('fipscode','pclidirisfrmtd','sell_year'),by.y=c('fipscode','pclidirisfrmtd','buy_year'),all.x=T)
  
  spells <- spells[order(fipscode,name,year)]

  
  if(!SHARABLE) {
    write.table(spells,paste0(processed.data.path,'person_panel_2000_2018_spell_details.csv'),row.names=F,col.names=T)
  } else {
    write.table(spells,paste0(sharable.processed.data.path,'person_panel_2000_2018_spell_details.csv'),row.names=F,col.names=T)
  }
  
  
  
} 

loadData <- function(arms.length.only = T,SAMPLE=T) {
  # Main data loader / prepper function
  if(!SAMPLE) {
    data.in <- fread('../data/processed/corelogic/Combined_processed.csv')
  } else {
    data.in <- fread('../data/processed/corelogic/share/Combined_processed.csv')
  }
  
  if(arms.length.only) {
    data.in <- data.in[propertytype == '10' & transactiontype == '1']
  } else {
    data.in <- data.in[propertytype == '10']
  }
  
  # Fix dates
  data.in[,date := as.Date(date)]
  data.in[,seller.buy.date := as.Date(seller.buy.date)]
  
  # Create some fields
  data.in[,zip5 := as.numeric(substr(situszipcode,1,5))]
  data.in[,qtr := as.yearqtr(date)]
  
  # Merge with census with geographical characteristics
  census.in <- loadCensusData()
  census.in[,income.bin := cut(median.household.income,breaks = c(0,30000,50000,70000,90000,110000,1e999),labels = c('<30k','30-50k','50-70k','70-90k','90-110k','110k+'))]
  census.in[,income.bin := factor(income.bin,levels(income.bin)[c(6,1:5)])]
  
  census.in[,college.bin := cut(pct.bachelors,breaks = c(0,.15,.30,.45,.60,1),labels=c('<15%','15-30%','30-45%','45-60%','60%+'))]
  census.in[,college.bin := factor(college.bin,levels(college.bin)[c(5,1:4)])]
  
  census.in[,median.age.bin := cut(median.age ,breaks = c(0,30,40,50,60,200),labels=c('<30','30-40','40-50','50-60','60+'))]
  census.in[,median.age.bin := factor(median.age.bin,levels(median.age.bin)[c(5,1:4)])]
  
  data.in <- merge(data.in,census.in,by.x='zip5',by.y='zip5')
  
  data.in[,living_sqft := as.numeric(living_sqft)]
  data.in[,land_sqft := as.numeric(landsquarefootage)]  
  data.in[,house.age := year - as.numeric(yearbuilt)]
  
  
  data.in <- data.in[is.na(saleamount) | saleamount < 10000000 & land_sqft < 50000]
  
  # Create bins on house characteristics
  data.in[,price.bin := cut(saleamount,breaks = c(0,100000,250000,500000,1000000,1e999))]
  data.in[,price.bin := factor(price.bin,levels(price.bin)[c(5,1:4)])]
  
  data.in[,age.bin := cut(house.age,breaks = c(0,5,15,50,1000000))]
  data.in[,age.bin := factor(age.bin,levels(age.bin)[c(4,1:3)])]
  
  data.in[,size.bin := cut(land_sqft,breaks = c(0,5000,10000,25000,1000000))]
  data.in[,size.bin := factor(size.bin,levels(size.bin)[c(4,1:3)])]
  
  data.in[,multistory := as.integer(storiesnum > 1)]
  
  data.in[,seller.ltv.bin := cut(seller.ltv,breaks = c(0,.25,.5,.75,.80,.85,.9,.95,100),labels = c('0-0.25','0.25-0.50','0.50-0.75','0.75-0.80','0.80-0.85','0.85-0.90','0.90-0.95','0.95-1.00'))]
  
  
  # Variable called ibuyer, which is just buyer OR seller
  data.in[,iBuyer := as.integer(iBuyer.buyer==1 | iBuyer.seller==1)]
  
  
  # Tenure
  data.in[,seller.tenure := as.numeric(date-seller.buy.date)]
  
  
  return(data.in)
  
}

loadCensusData <- function() {
  census.in <- fread('../data/raw/census/nhgis0023_ds201_20135_2013_zcta.csv')
  census.in[,zip5 := ZCTA5A]
  
  
  # Topics
  # UEF: age by sex
  # UEQ: Race
  # UEZ: Hispanic or latino
  # UFF: Means of transport
  # UFH: Travel time
  # UGS: Educational attainment
  # UHD: Median household income
  # UJ8: Employment status
  # UMF: Rent as % of household income
  
  # Age by sex --> just keep total.
  data.temp <- census.in[,c('zip5','UEFE001'),with=F]
  names(data.temp) <- c('zip5','median.age')
  data.working <- data.temp
  
  # Race --> get total population and % white.
  data.temp <- census.in[,c('zip5','UEQE001','UEQE002'),with=F]
  data.temp[,pop.total := UEQE001]
  data.temp[,pct.white := UEQE002 / UEQE001]
  data.temp <- data.temp[,c('zip5','pop.total','pct.white'),with=F]
  data.working <- merge(data.working,data.temp,by='zip5')
  
  # Hispanic or latino -> get % latino
  data.temp <- census.in[,c('zip5','UEZE001','UEZE003'),with=F]
  data.temp[,pct.hispanic := UEZE003 / UEZE001]
  data.temp <- data.temp[,c('zip5','pct.hispanic'),with=F]
  data.working <- merge(data.working,data.temp,by='zip5')  
  
  # Transportation -> Get % car, % public, % walk or bike
  data.temp <- census.in[,c('zip5','UFFE001','UFFE002','UFFE010','UFFE018','UFFE019'),with=F]
  data.temp[,pct.commute.car := UFFE002 / UFFE001]
  data.temp[,pct.commute.public := UFFE010 / UFFE001]
  data.temp[,pct.commute.bike.walk := (UFFE018 + UFFE019) / UFFE001]
  data.temp <- data.temp[,c('zip5','pct.commute.car','pct.commute.public','pct.commute.bike.walk'),with=F]
  data.working <- merge(data.working,data.temp,by='zip5')
  
  # Travel time -> Get % with commute above 60 minutes
  data.temp <- census.in[,c('zip5','UFHE001','UFHE012','UFHE013'),with=F]
  data.temp[,pct.commute.above.60 := (UFHE012 + UFHE013) / UFHE001]
  data.temp <- data.temp[,c('zip5','pct.commute.above.60')]
  data.working <- merge(data.working,data.temp,by='zip5')  
  
  # Educational attainment -> Get % with bachelors or above
  data.temp <- census.in[,c('zip5','UGSE001','UGSE022','UGSE023','UGSE024','UGSE025'),with=F]
  data.temp[,pct.bachelors := (UGSE022 + UGSE023 + UGSE024 + UGSE025) / UGSE001]
  data.temp <- data.temp[,c('zip5','pct.bachelors'),with=F]
  data.working <- merge(data.working,data.temp,by='zip5')
  
  # Median household income --> Just use median household income.
  data.temp <- census.in[,c('zip5','UHDE001'),with=F]
  names(data.temp) <- c('zip5','median.household.income')
  data.working <- merge(data.working,data.temp,by='zip5')
  
  # Unemployment -> Use percentage of unemployed in civilian labor force
  data.temp <- census.in[,c('zip5','UJ8E003','UJ8E005'),with=F]
  data.temp[,pct.unemployed := UJ8E005 / UJ8E003]
  data.temp <- data.temp[,c('zip5','pct.unemployed'),with=F]
  data.working <- merge(data.working,data.temp,by='zip5')
  
  # Rental cost as fraction of income --> get percent paying over 50% of income
  data.temp <- census.in[,c('zip5','UMFE001','UMFE008','UMFE009','UMFE010'),with=F]
  data.temp[,pct.rent.over.50 := ( UMFE010) / UMFE001]
  data.temp <- data.temp[,c('zip5','pct.rent.over.50'),with=F]
  data.working <- merge(data.working,data.temp,by='zip5')
  
  # Return the result
  return(data.working)
}


getLastYearlyTransaction <- function() {
  transactions <- loadData() # Get the cleaned up corelogic transactions
  
  # Create cleaned buyername
  transactions[,buyername_cleaned := trimws(paste(owner1firstnamemi,owner1lastname))]
  transactions[,buyername_cleaned := trimws(gsub(pattern = '[^[:alnum:] ]',x = buyername_cleaned,replace = ''))]
  
  # All people who owned properties. First, filter out corporations and 
  transactions[,n := sum(absenteeownerstatus != 'Y'),by='buyername_cleaned'] # If you have a ton of absentee owner stuff we're skeptical.....
  transactions[,zip5 := substr(situszipcode,1,5)]
  transactions[,buy_year := year]
  transactions$year <- NULL
  
  # Get rid of corporations, people with transactions > 10, and guys with corporate names.
  corpNames = c('[0-9]|ARIZONA|CORP|LLC|INC|LP|SALES|CONSTRUCTION|TRUST|INC|unknown|HOMES|FNAM|DEV|JR|HUD|LTD|INS|MAE|ASSN|MARKETING|RECONVEYANCE|BK|FSB|CORPORATION|PROPERTIES|ADMN|ASSOCIATES|BANK|AMERICA|CORPS|DEVELOPMENT|AFFAIRS|INVESTMENTS|COUNTY|CORPORAT|HMS')
  transactions[,corpname := grepl(x = buyername_cleaned,pattern = corpNames,ignore.case = T)]  
  
  transactions[,last.yearly.transaction := max(date),by=c('fipscode','pclidirisfrmtd','buy_year')]
  last.yearly <- transactions[date == last.yearly.transaction ]
  
  # Get tha max sale amount . Get the biggest one.
  last.yearly[,max.sale.amount := max(saleamount),by=c('fipscode','pclidirisfrmtd','buy_year')]
  last.yearly <- last.yearly[saleamount == max.sale.amount]
  
  # Now if it's duplicated it's just bullshit. Just drop one of them.
  last.yearly <- last.yearly[!duplicated(paste(fipscode,pclidirisfrmtd,buy_year))]
  
  # Okay---each property has at most one transaction in a year. Want to get when that property is sold. This is the next thing in the transactions. Note we should have
  # the same number of properties.
  last.yearly <- last.yearly[order(fipscode,pclidirisfrmtd,buy_year)]
  last.yearly[,name := buyername_cleaned]
  
  
  return(last.yearly)
}

createSellerPanel()
